# Clean the raw collisions data: drop constant fields, fix column types, and
# split "location_1" (a "(lat, lon)" string) into numeric lat/lon columns.
data_clean = data %>%
  select(-c(crm_cd_desc, crm_cd)) %>% # These fields have only 1 value
  mutate(dr_no = as.character(dr_no), # These fields are not really numeric
         area = as.character(area),
         rpt_dist_no = as.character(rpt_dist_no),
         premis_cd = as.character(premis_cd),
         location = toupper(gsub(" +", " ", location)), # Collapse repeated spaces and
         cross_street = toupper(gsub(" +", " ", cross_street)), # upper-case street fields
         # Strip the wrapping parentheses around the coordinate pair.
         # NOTE(review): the first pattern also matches embedded newlines --
         # presumably an artifact of how the raw file is exported; confirm.
         location_1 = gsub("\n, \n\\(", "", location_1),
         location_1 = gsub(")", "", location_1, fixed = TRUE),
         # regexpr() returns an integer vector of first-match positions.
         # The previous as.numeric(gregexpr(...)) coerced a *list* and would
         # error if any value ever contained more than one comma; regexpr()
         # is equivalent for the expected single-comma case and robust otherwise.
         lat = substr(location_1, start = 1,
                      stop = regexpr(",", location_1, fixed = TRUE) - 1),
         lon = substr(location_1, start = regexpr(",", location_1, fixed = TRUE) + 1,
                      stop = nchar(location_1)),
         lat = as.numeric(lat),
         lon = as.numeric(lon))
This document contains results of the LA Traffic Collisions data analysis.
Here are the key questions I’m interested in:
The data set contains info on traffic collision incidents in Los Angeles from January 2010 - August 2019. The data is transcribed from original traffic reports that are typed on paper and so may have errors.
The data is maintained by the City of Los Angeles and is updated weekly. The data has 485,992 rows and 18 columns. More info can be found on the Los Angeles Open Data portal.
Data Sample (selected fields):
# Preview a few selected fields from the cleaned data
data_clean %>%
  select(dr_no, date_occ, time_occ, area_name, vict_age, vict_sex, vict_descent, location_1) %>%
  head(3) %>%
  kable() %>%
  kable_styling()
| dr_no | date_occ | time_occ | area_name | vict_age | vict_sex | vict_descent | location_1 |
|---|---|---|---|---|---|---|---|
| 100100007 | 2010-11-08 | 2200 | Central | NA | F | O | 34.0395, -118.2656 |
| 100100767 | 2010-03-31 | 400 | Central | 21 | M | H | 34.0695, -118.2324 |
| 100100831 | 2010-04-18 | 140 | Central | 50 | F | H | 34.0424, -118.2718 |
First, I’ll take a general look at some of these variables.
# Histogram of victim age in one-year bins
data_clean %>%
  select(vict_age) %>%
  ggplot(aes(x = vict_age)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Victim Age", y = "Count", title = "Histogram of Collisions by Victim Age")
# Collisions by victim sex, dropping blanks and apparently mis-keyed codes
# ("H" and "N" are not valid sex codes -- presumably descent values typed
# into the wrong field; confirm against the data dictionary).
# count() replaces the group_by/summarize/ungroup chain; geom_col() is
# the idiomatic equivalent of geom_bar(stat = "identity").
data_clean %>%
  count(vict_sex, name = "total") %>%
  filter(!(vict_sex %in% c("", "H", "N"))) %>%
  ggplot(aes(x = vict_sex, y = total)) +
  geom_col() +
  labs(x = "Victim Sex", y = "Collisions", title = "Collisions by Victim Sex")
Below, I check earliest collision report date per area. Based on some of the plots later in the document, it seemed possible that certain areas weren’t reporting for the entire data set.
# Earliest collision date on record for each area
data %>%
  group_by(area) %>%
  summarize(min_date = min(date_occ)) %>%
  ungroup() %>%
  rename(Area = area, `Earliest Collision Report Date` = min_date) %>%
  kable() %>%
  kable_styling()
| Area | Earliest Collision Report Date |
|---|---|
| 1 | 2010-01-01 |
| 2 | 2010-01-01 |
| 3 | 2010-01-01 |
| 4 | 2010-01-01 |
| 5 | 2010-01-01 |
| 6 | 2010-01-01 |
| 7 | 2010-01-01 |
| 8 | 2010-01-01 |
| 9 | 2010-01-01 |
| 10 | 2010-01-01 |
| 11 | 2010-01-01 |
| 12 | 2010-01-01 |
| 13 | 2010-01-01 |
| 14 | 2010-01-01 |
| 15 | 2010-01-01 |
| 16 | 2010-01-01 |
| 17 | 2010-01-01 |
| 18 | 2010-01-01 |
| 19 | 2010-01-01 |
| 20 | 2010-01-01 |
| 21 | 2010-01-01 |
However, all areas have data starting from the beginning of the reporting period. It is still possible that certain sections of an area were not reporting for the entire period.
This section looks at the days with the lowest/highest number of collisions.
# Build a data set of collision counts per day
outliers = data_clean %>%
  filter(date_occ != "2019-08-17") %>% # This day looks like it has partial results
  group_by(date_occ) %>%
  summarize(daily_count = n()) %>%
  ungroup()
Here are the days with the lowest number of collisions:
# Days in the bottom 0.5% of daily collision counts.
# NOTE(review): the "hypothesis" labels are hand-aligned to the filtered
# rows in ascending date order -- if the data or the 0.005 cutoff ever
# changes, these labels will silently attach to the wrong days.
outliers %>%
filter(daily_count < quantile(outliers$daily_count, 0.005)) %>%
mutate(day = weekdays(date_occ),
hypothesis = c("New Year", "Presidents Day", "Thanksgiving", "Thanksgiving",
"Christmas / New Year", "Presidents Day", "Thanksgiving", "Christmas",
"New Years", "MLK Day", "Thanksgiving", "Christmas",
"Christmas / New Year", "Christmas / New Year", "Random Sunday",
"MLK Day", "Christmas"),
date_occ = as.character(date_occ))
## # A tibble: 17 x 4
## date_occ daily_count day hypothesis
## <chr> <int> <chr> <chr>
## 1 2010-01-02 79 Saturday New Year
## 2 2010-02-15 80 Monday Presidents Day
## 3 2010-11-25 73 Thursday Thanksgiving
## 4 2010-11-26 79 Friday Thanksgiving
## 5 2011-12-28 82 Wednesday Christmas / New Year
## 6 2012-02-20 84 Monday Presidents Day
## 7 2012-11-22 82 Thursday Thanksgiving
## 8 2012-12-25 68 Tuesday Christmas
## 9 2012-12-30 79 Sunday New Years
## 10 2013-01-21 81 Monday MLK Day
## 11 2013-11-28 71 Thursday Thanksgiving
## 12 2013-12-25 80 Wednesday Christmas
## 13 2013-12-28 81 Saturday Christmas / New Year
## 14 2013-12-29 86 Sunday Christmas / New Year
## 15 2014-01-12 86 Sunday Random Sunday
## 16 2015-01-19 79 Monday MLK Day
## 17 2018-12-25 83 Tuesday Christmas
Here are the days with the highest number of collisions:
# Days in the top 0.5% of daily collision counts
outliers %>%
  filter(daily_count > quantile(outliers$daily_count, 0.995)) %>%
  mutate(day = weekdays(date_occ), date_occ = as.character(date_occ))
## # A tibble: 17 x 3
## date_occ daily_count day
## <chr> <int> <chr>
## 1 2010-02-05 202 Friday
## 2 2010-12-17 219 Friday
## 3 2015-10-09 213 Friday
## 4 2015-11-20 208 Friday
## 5 2016-02-17 218 Wednesday
## 6 2016-12-15 223 Thursday
## 7 2017-09-29 204 Friday
## 8 2017-11-04 202 Saturday
## 9 2017-11-17 230 Friday
## 10 2017-12-01 219 Friday
## 11 2018-01-08 212 Monday
## 12 2018-01-09 204 Tuesday
## 13 2018-02-16 217 Friday
## 14 2018-03-02 205 Friday
## 15 2018-10-12 216 Friday
## 16 2018-11-30 202 Friday
## 17 2018-12-07 207 Friday
This section looks at how traffic collision patterns vary by time of day, day of week, and time of year.
Below are quantiles of the difference between the date a collision is reported vs. the date it occurred.
# Gap in days between the report date and the occurrence date
gap = data %>%
  mutate(gap = as.numeric(ymd(date_rptd) - ymd(date_occ)))
quantile(gap$gap)
## 0% 25% 50% 75% 100%
## 0 0 0 1 2922
Most collisions are reported on the same day they occurred, however some are not reported for years! Could this be a data error?
Next, I’ll look at these quantiles only for collisions not reported on the same day as occurrence.
# Same quantiles, restricted to collisions not reported the same day
gap %>% filter(gap > 0) %>% pull(gap) %>% quantile()
## 0% 25% 50% 75% 100%
## 1 1 1 2 2922
Even among accidents that are not reported the same day, most are reported the day after.
# Daily collision counts across 2018
data_clean %>%
  filter(substr(date_occ, 1, 4) == "2018") %>%
  count(date_occ, name = "daily_total") %>%
  ggplot(aes(x = as.Date(date_occ), y = daily_total)) +
  geom_line() + geom_point() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  ylim(0, 250) +
  labs(x = "", y = "Daily Collisions", title = "2018 Daily Collisions")
# Daily collisions *reported* across 2018 -- decently different
# from the daily-occurrence pattern
data_clean %>%
  filter(substr(date_rptd, 1, 4) == "2018") %>%
  count(date_rptd, name = "daily_total") %>%
  ggplot(aes(x = as.Date(date_rptd), y = daily_total)) +
  geom_line() + geom_point() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  ylim(0, 250) +
  labs(x = "", y = "Daily Collisions Reported", title = "2018 Daily Collisions Reported")
# Monthly collision totals over the full period
data_clean %>%
  mutate(date_mod = ymd(paste0(substr(date_occ, 1, 7), "-01"))) %>%
  count(date_mod, name = "month_total") %>%
  filter(date_mod != ymd("2019-08-01")) %>% # Drop the incomplete final month
  ggplot(aes(x = date_mod, y = month_total, group = 1)) +
  geom_line() + geom_point() +
  expand_limits(y = 0) +
  labs(x = "", y = "Collisions", title = "Monthly Collisions")
# Monthly collisions *reported* over the full period
data_clean %>%
  mutate(date_mod = ymd(paste0(substr(date_rptd, 1, 7), "-01"))) %>%
  count(date_mod, name = "month_total") %>%
  filter(date_mod != ymd("2019-08-01")) %>% # Drop the incomplete final month
  ggplot(aes(x = date_mod, y = month_total, group = 1)) +
  geom_line() + geom_point() +
  ylim(0, 5300) +
  labs(x = "", y = "Collisions Reported", title = "Monthly Collisions Reported")
# Collisions by time of day; time_occ is an HHMM integer, so bins of 50
# are half-hour-ish slices of the clock
data_clean %>%
  select(time_occ) %>%
  ggplot(aes(x = time_occ)) +
  geom_histogram(binwidth = 50, color = "black") +
  scale_x_continuous(breaks = seq(0, 2400, 200)) +
  labs(x = "Time Occurred (24 Hour Format)", y = "Count", title = "Collisions by Time of Day")
The data does not include a timestamp for when collisions were reported.
# Collisions by day of week (ordered Sunday through Saturday)
data_clean %>%
  mutate(day = weekdays(date_occ)) %>%
  count(day, name = "day_total") %>%
  ggplot(aes(x = factor(day, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")),
             y = day_total)) +
  geom_col() +
  labs(x = "", y = "Collisions", title = "Collisions by Day of Week")
# Collisions reported by day of week.
# Sunday still the lowest, Friday still the highest;
# weekdays are higher and more even now.
data_clean %>%
  mutate(day = weekdays(date_rptd)) %>%
  count(day, name = "day_total") %>%
  ggplot(aes(x = factor(day, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")),
             y = day_total)) +
  geom_col() +
  ylim(0, 80000) +
  labs(x = "", y = "Collisions Reported", title = "Collisions Reported by Day of Week")
# Collisions by calendar month (all years pooled)
data_clean %>%
  mutate(month = as.numeric(substr(date_occ, 6, 7))) %>%
  count(month, name = "month_total") %>%
  ggplot(aes(x = month, y = month_total)) +
  geom_col() +
  scale_x_continuous(breaks = c(1:12)) +
  labs(x = "Month", y = "Collisions", title = "Collisions by Month")
# Collisions reported by calendar month -- very similar to occurrences
data_clean %>%
  mutate(month = as.numeric(substr(date_rptd, 6, 7))) %>%
  count(month, name = "month_total") %>%
  ggplot(aes(x = month, y = month_total)) +
  geom_col() +
  scale_x_continuous(breaks = c(1:12)) +
  ylim(0, 45000) +
  labs(x = "Month", y = "Collisions Reported", title = "Collisions Reported by Month")
This section examines how traffic collisions are distributed geographically. It identifies high-risk areas and analyzes collisions geographically by daypart, weekpart, and time of year. I first plot the number of collisions per area.
# Collision counts by area, sorted from most to least
data_clean %>%
  count(area_name, name = "total") %>%
  ggplot(aes(x = reorder(area_name, -total), y = total)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "", y = "Count", title = "Collision Count by Area")
Next, I look at the most commonly occurring location, cross_street, and location/cross_street combinations.
These are broad metrics, but can still be useful.
# Most commonly occurring "location"
data_clean %>%
group_by(location) %>% summarize(count = n()) %>% ungroup() %>%
mutate(percentage = round(100 * count / nrow(data_clean), 1)) %>%
arrange(-count) %>%
head(10)
## # A tibble: 10 x 3
## location count percentage
## <chr> <int> <dbl>
## 1 WESTERN AV 6377 1.3
## 2 VENTURA BL 5914 1.2
## 3 SHERMAN WY 5797 1.2
## 4 SEPULVEDA BL 5493 1.1
## 5 VERMONT AV 5346 1.1
## 6 VICTORY BL 5114 1.1
## 7 SUNSET BL 5112 1.1
## 8 FIGUEROA ST 4707 1
## 9 ROSCOE BL 4395 0.9
## 10 OLYMPIC BL 4280 0.9
# Most commonly occurring "cross_street"
data_clean %>%
group_by(cross_street) %>% summarize(count = n()) %>% ungroup() %>%
mutate(percentage = round(100 * count / nrow(data_clean), 1)) %>%
arrange(-count) %>%
head(10)
## # A tibble: 10 x 3
## cross_street count percentage
## <chr> <int> <dbl>
## 1 "" 21813 4.5
## 2 VERMONT AV 3633 0.7
## 3 FIGUEROA ST 3418 0.7
## 4 WESTERN AV 3413 0.7
## 5 SHERMAN WY 2873 0.6
## 6 SEPULVEDA BL 2761 0.6
## 7 VICTORY BL 2758 0.6
## 8 BROADWAY 2694 0.6
## 9 3RD ST 2615 0.5
## 10 PICO BL 2602 0.5
# Most commonly occurring "location" and "cross_street" combo
data_clean %>%
group_by(location, cross_street) %>% summarize(count = n()) %>% ungroup() %>%
mutate(percentage = round(100 * count / nrow(data_clean), 2)) %>%
arrange(-count) %>%
head(10)
## # A tibble: 10 x 4
## location cross_street count percentage
## <chr> <chr> <int> <dbl>
## 1 SHERMAN WY SEPULVEDA BL 247 0.05
## 2 TAMPA AV NORDHOFF ST 240 0.05
## 3 VAN NUYS BL ROSCOE BL 220 0.05
## 4 SHERMAN WY WOODMAN AV 215 0.04
## 5 SHERMAN WY WHITSETT AV 210 0.04
## 6 ROSCOE BL VAN NUYS BL 195 0.04
## 7 SLAUSON AV WESTERN AV 195 0.04
## 8 BURBANK BL SEPULVEDA BL 191 0.04
## 9 MANCHESTER AV FIGUEROA ST 182 0.04
## 10 SHERMAN WY BELLAIRE AV 177 0.04
Next I map collisions. First, I create an overall map.
# Create version of data for mapping: one row per (lat, lon) with its
# 2018 collision count.
# Data is pretty cluttered so I do it for one year only
data_map = data_clean %>%
filter(substr(date_occ, 1, 4) == "2018" & # 2018 only
lat != 0 & lon != 0) %>% # Drops rows where EITHER lat or lon is 0 (missing coordinates)
group_by(lat, lon) %>% summarize(count = n()) %>% ungroup()
# Overall map of 2018 collisions, centered on the mean coordinate.
# Cluttered, not super useful, but shows the weird shape of Los Angeles.
m = get_map(location = c(lon = mean(data_clean$lon), lat = mean(data_clean$lat)),
            zoom = 10, maptype = "roadmap", scale = 2)
ggmap(m) +
  geom_point(data = data_map, aes(x = lon, y = lat, alpha = count, color = count)) +
  scale_alpha(guide = "none") +
  scale_colour_gradient(low = "cornflowerblue", high = "red") +
  theme(axis.text = element_blank(), axis.ticks = element_blank()) +
  labs(x = "", y = "", title = "Overall Map: 2018 Collisions", color = "Collisions")
To get a better view, I’ll zoom in on some areas. Note that many of the maps below overlap.
# Zoomed-in 2018 collision maps for four areas. Each chunk only differs in
# the map center and zoom.
# NOTE(review): these four chunks are copy-pasted; a small helper taking
# (lon, lat, zoom, title) would remove the duplication.
# Valley
m = get_map(location = c(lon = -118.47, lat = 34.3),
zoom = 11, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = count), data = data_map) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
scale_colour_gradient(low = "cornflowerblue", high = "red") +
labs(x = "", y = "", title = "The Valley: 2018 Collisions", color = "Collisions")
# East LA
m = get_map(location = c(lon = -118.285, lat = 34.0407),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = count), data = data_map) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
scale_colour_gradient(low = "cornflowerblue", high = "red") +
labs(x = "", y = "", title = "East LA: 2018 Collisions", color = "Collisions")
# Westside
m = get_map(location = c(lon = -118.415, lat = 34.0),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = count), data = data_map) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
scale_colour_gradient(low = "cornflowerblue", high = "red") +
labs(x = "", y = "", title = "West LA: 2018 Collisions", color = "Collisions")
# Long Beach
m = get_map(location = c(lon = -118.2937, lat = 33.7901),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = count), data = data_map) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
scale_colour_gradient(low = "cornflowerblue", high = "red") +
labs(x = "", y = "", title = "Long Beach: 2018 Collisions", color = "Collisions")
Next, I take the most common collision daypart for each coordinate and plot. The dayparts I use are:
The overall map is cluttered, so I jump straight to area maps.
# Create version of data for mapping the most common collision daypart
# per coordinate.
# Data is pretty cluttered so I do it for one year only
# NOTE(review): coordinates whose top daypart count is tied between two or
# more dayparts are dropped entirely by the row_count filter below, which
# can discard some high-traffic intersections.
data_map_time = data_clean %>%
filter(substr(date_occ, 1, 4) == "2018" & # 2018 only
lat != 0 & lon != 0) %>% # Drop rows where either "lat" or "lon" is 0
mutate(daypart = case_when(time_occ >= 2200 | time_occ < 400 ~ "Late Night (10PM - 4AM)", # Create dayparts
time_occ >= 400 & time_occ < 800 ~ "Early Morning (4AM - 8AM)",
time_occ >= 800 & time_occ < 1200 ~ "Late Morning (8AM - 12PM)",
time_occ >= 1200 & time_occ < 1700 ~ "Afternoon (12PM - 5PM)",
TRUE ~ "Evening (5PM - 10PM)"),
daypart = factor(daypart, levels = c("Early Morning (4AM - 8AM)", "Late Morning (8AM - 12PM)",
"Afternoon (12PM - 5PM)", "Evening (5PM - 10PM)",
"Late Night (10PM - 4AM)"))) %>%
group_by(lat, lon) %>% mutate(total = n()) %>% ungroup() %>% # Total collisions per coordinate
group_by(lat, lon, total, daypart) %>% summarize(count = n()) %>% ungroup() %>% # Count collisions per coordinate / daypart
group_by(lat, lon) %>% mutate(max_val = max(count)) %>% ungroup() %>% # Only keep max collision dayparts
filter(count == max_val) %>%
group_by(lat, lon) %>% mutate(row_count = n()) %>% ungroup() %>% # Remove coordinates with equal collisions in
filter(row_count == 1) # different dayparts. This could lose some big coordinates
# Color scale for the five dayparts (order matches the daypart factor levels).
# The original line read `daypart_values = values = c(...)`, a double
# assignment that also created a stray global object named `values` --
# almost certainly a paste error from a scale_*_manual(values = ...) call.
daypart_values = c("pink", "orange", "indianred1", "purple", "black")
# Most-common-daypart maps, faceted by daypart, for coordinates with 5+
# collisions. Four copy-pasted chunks differing only in center/zoom/title.
# Valley
m = get_map(location = c(lon = -118.47, lat = 34.3),
zoom = 11, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = daypart),
data = filter(data_map_time, total >= 5)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
scale_color_manual(values = daypart_values) +
facet_grid(cols = vars(daypart)) +
labs(x = "", y = "", title = "The Valley: 2018 Collisions by Daypart (Coordinates with 5+ Collisions)", color = "Most Common Daypart")
# East LA
m = get_map(location = c(lon = -118.285, lat = 34.0407),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = daypart),
data = filter(data_map_time, total >= 5)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
scale_color_manual(values = daypart_values) +
facet_grid(cols = vars(daypart)) +
labs(x = "", y = "", title = "East LA: 2018 Collisions by Daypart (Coordinates with 5+ Collisions)", color = "Most Common Daypart")
# Westside
m = get_map(location = c(lon = -118.415, lat = 34.0),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = daypart),
data = filter(data_map_time, total >= 5)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
scale_color_manual(values = daypart_values) +
facet_grid(cols = vars(daypart)) +
labs(x = "", y = "", title = "West LA: 2018 Collisions by Daypart (Coordinates with 5+ Collisions)", color = "Most Common Daypart")
# Long Beach
m = get_map(location = c(lon = -118.2937, lat = 33.7901),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = daypart),
data = filter(data_map_time, total >= 5)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
scale_color_manual(values = daypart_values) +
facet_grid(cols = vars(daypart)) +
labs(x = "", y = "", title = "Long Beach: 2018 Collisions by Daypart (Coordinates with 5+ Collisions)", color = "Most Common Daypart")
Next, I look at whether collisions mostly occur on weekdays or weekends geographically.
# Create version of data for mapping most common accident weekpart
# Data is pretty cluttered so I do it for one year only
# NOTE(review): coordinates tied between weekday and weekend are dropped
# by the row_count filter, which can discard busy intersections.
data_map_week = data_clean %>%
filter(substr(date_occ, 1, 4) == "2018" & # 2018 only
lat != 0 & lon != 0) %>% # Drop rows where either "lat" or "lon" is 0
mutate(day = weekdays(date_occ),
week_type = if_else(day %in% c("Saturday", "Sunday"), "Weekend", "Weekday")) %>%
group_by(lat, lon) %>% mutate(total = n()) %>% ungroup() %>% # Total collisions per coordinate
group_by(lat, lon, total, week_type) %>% summarize(count = n()) %>% ungroup() %>% # Count collisions per coordinate / weekpart
group_by(lat, lon) %>% mutate(max_val = max(count)) %>% ungroup() %>% # Only keep max collision weekparts
filter(count == max_val) %>%
group_by(lat, lon) %>% mutate(row_count = n()) %>% ungroup() %>% # Remove coordinates with equal collisions in
filter(row_count == 1) # both weekparts. This could lose some big coordinates
# Most-common-weekpart maps, faceted by weekpart, for coordinates with 3+
# collisions. Four copy-pasted chunks differing only in center/zoom/title.
# Valley
m = get_map(location = c(lon = -118.47, lat = 34.3),
zoom = 11, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = week_type), data = filter(data_map_week, total >= 3)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
facet_grid(cols = vars(week_type)) +
labs(x = "", y = "", title = "The Valley: 2018 Collisions by Weekpart (Coordinates with 3+ Collisions)", color = "Most Common Weekpart")
# East LA
m = get_map(location = c(lon = -118.285, lat = 34.0407),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = week_type), data = filter(data_map_week, total >= 3)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
facet_grid(cols = vars(week_type)) +
labs(x = "", y = "", title = "East LA: 2018 Collisions by Weekpart (Coordinates with 3+ Collisions)", color = "Most Common Weekpart")
# Westside
m = get_map(location = c(lon = -118.415, lat = 34.0),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = week_type), data = filter(data_map_week, total >= 3)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
facet_grid(cols = vars(week_type)) +
labs(x = "", y = "", title = "West LA: 2018 Collisions by Weekpart (Coordinates with 3+ Collisions)", color = "Most Common Weekpart")
# Long Beach
m = get_map(location = c(lon = -118.2937, lat = 33.7901),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = week_type), data = filter(data_map_week, total >= 3)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
facet_grid(cols = vars(week_type)) +
labs(x = "", y = "", title = "Long Beach: 2018 Collisions by Weekpart (Coordinates with 3+ Collisions)", color = "Most Common Weekpart")
# Create version of data for mapping most common accident season
# Data is pretty cluttered so I do it for one year only
# NOTE(review): coordinates tied across seasons are dropped by the
# row_count filter, which can discard busy intersections.
data_map_season = data_clean %>%
filter(substr(date_occ, 1, 4) == "2018" & # 2018 only
lat != 0 & lon != 0) %>% # Drop rows where either "lat" or "lon" is 0
mutate(month = as.numeric(substr(date_occ, 6, 7)),
season = case_when(month %in% c(12, 1, 2) ~ "Dec-Feb",
month %in% c(3, 4, 5) ~ "Mar-May",
month %in% c(6, 7, 8) ~ "June-Aug",
TRUE ~ "Sep-Nov")) %>%
group_by(lat, lon) %>% mutate(total = n()) %>% ungroup() %>% # Total collisions per coordinate
group_by(lat, lon, total, season) %>% summarize(count = n()) %>% ungroup() %>% # Count collisions per coordinate / season
group_by(lat, lon) %>% mutate(max_val = max(count)) %>% ungroup() %>% # Only keep max collision seasons
filter(count == max_val) %>%
group_by(lat, lon) %>% mutate(row_count = n()) %>% ungroup() %>% # Remove coordinates with equal collisions in
filter(row_count == 1) # different seasons. This could lose some big coordinates
# Most-common-season maps, faceted by season, for coordinates with 3+
# collisions. Four copy-pasted chunks differing only in center/zoom/title.
# Valley
m = get_map(location = c(lon = -118.47, lat = 34.3),
zoom = 11, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = season), data = filter(data_map_season, total >= 3)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
facet_grid(cols = vars(season)) +
labs(x = "", y = "", title = "The Valley: 2018 Collisions by Season (Coordinates with 3+ Collisions)", color = "Most Common Season")
# East LA
m = get_map(location = c(lon = -118.285, lat = 34.0407),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = season), data = filter(data_map_season, total >= 3)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
facet_grid(cols = vars(season)) +
labs(x = "", y = "", title = "East LA: 2018 Collisions by Season (Coordinates with 3+ Collisions)", color = "Most Common Season")
# Westside
m = get_map(location = c(lon = -118.415, lat = 34.0),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = season), data = filter(data_map_season, total >= 3)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
facet_grid(cols = vars(season)) +
labs(x = "", y = "", title = "West LA: 2018 Collisions by Season (Coordinates with 3+ Collisions)", color = "Most Common Season")
# Long Beach
m = get_map(location = c(lon = -118.2937, lat = 33.7901),
zoom = 12, maptype = "roadmap", scale = 2)
ggmap(m) + geom_point(aes(x = lon, y = lat, alpha = count, color = season), data = filter(data_map_season, total >= 3)) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
scale_alpha(guide = "none") +
facet_grid(cols = vars(season)) +
labs(x = "", y = "", title = "Long Beach: 2018 Collisions by Season (Coordinates with 3+ Collisions)", color = "Most Common Season")
This section explores predicting the number of collisions in a given time frame. Specifically, I predict the number of collisions per month per area.
Let’s see how this data looks for a given area.
# Monthly collisions per area, dropping the partial final month
model_data = data_clean %>%
  mutate(month = ymd(paste0(substr(date_occ, 1, 7), "-01"))) %>% # Month variable
  filter(month != ymd("2019-08-01")) %>% # Throw out partial month
  count(month, area, name = "collisions")
# Monthly collisions for a single sample area
model_data %>%
  filter(area == 2) %>%
  ggplot(aes(x = month, y = collisions)) +
  geom_line() + geom_point() +
  expand_limits(y = 0) +
  labs(x = "", y = "Monthly Collisions", title = "Monthly Collisions for Area 2")
Let’s take a look at the decomposition for area 2.
# Classical decomposition of area 2's monthly series (12-month frequency)
model_data %>%
  filter(area == 2) %>%
  pull(collisions) %>%
  ts(frequency = 12) %>%
  decompose() %>%
  autoplot(main = "Area 2 Decomposition")
I’ll also look at the decomposition for total monthly collisions (all areas).
# Decomposition of the overall (all-area) monthly series
data_clean %>%
  mutate(date_mod = ymd(paste0(substr(date_occ, 1, 7), "-01"))) %>%
  count(date_mod, name = "month_total") %>%
  filter(date_mod != ymd("2019-08-01")) %>%
  pull(month_total) %>%
  ts(frequency = 12) %>%
  decompose() %>%
  autoplot(main = "Overall Data Decomposition")
Next, I look at the ACF and PACF for area 2.
# Autocorrelation diagnostics for area 2
model_data %>% filter(area == 2) %>% pull(collisions) %>% ggAcf() +
  labs(title = "ACF for Area 2")
model_data %>% filter(area == 2) %>% pull(collisions) %>% ggPacf() +
  labs(title = "PACF for Area 2")
I’ll try the following modeling approaches:
MA models are the simplest time series method and average past values to generate a prediction. ARIMA models use past values, differencing, and previous errors. Prophet is an additive forecasting method where non-linear trends are fit with yearly, weekly, and daily (not in this case) seasonality. It can also support additional external regressors.
I evaluate each model on the final 12 months of data (August 2018 to July 2019) with the following metrics:
The MAPE tells me the average magnitude of model error, while the bias tells me if the model is systematically over- or under-predicting.
Let’s look at the best ARIMA fit for area 2.
# Let auto.arima() search over (p, d, q) orders (and drift) and report
# the best-fitting ARIMA specification for area 2's monthly series.
auto.arima(filter(model_data, area == 2)$collisions)
## Series: filter(model_data, area == 2)$collisions
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## -0.8355 0.4845
## s.e. 0.0517 0.2413
##
## sigma^2 estimated as 227.3: log likelihood=-470.64
## AIC=947.29 AICc=947.51 BIC=955.5
all_preds = read_csv("all_preds.csv")
After fitting the models, I want to look at some metrics. First, I’ll look at the average MAPE and bias for each model.
# Table of mean overall performance per model.
# across() with .names = "mean_{.col}" produces the same eight
# mean_*_mape / mean_*_bias columns, in the same order, as the previous
# hand-written summarize() -- without repeating round(mean(...), 4)
# eight times. (Requires dplyr >= 1.0 for across().)
all_preds %>%
  summarize(across(c(m3_mape, m6_mape, arima_mape, prophet_mape,
                     m3_bias, m6_bias, arima_bias, prophet_bias),
                   ~ round(mean(.x), 4),
                   .names = "mean_{.col}")) %>%
  data.frame()
## mean_m3_mape mean_m6_mape mean_arima_mape mean_prophet_mape mean_m3_bias
## 1 0.0796 0.0797 0.0797 0.0915 0.0113
## mean_m6_bias mean_arima_bias mean_prophet_bias
## 1 0.0137 0.0234 0.069
Let’s plot these results. First, I’ll look at the actual collisions and model predictions for area 2.
# Align area to character so it joins cleanly against model_data below
all_preds = mutate(all_preds, area = as.character(area))
# Look at data and preds for area 2: join the *_pred columns onto the
# actuals, reshape long, and plot each series in a fixed color order
# (Actual drawn in black).
model_data %>%
mutate(area = as.character(area)) %>%
filter(area == 2 & month >= ymd("2018-05-01")) %>%
left_join(select(all_preds, -c(collisions, m3_mape, m3_bias, m6_mape, m6_bias, arima_mape, arima_bias, prophet_mape, prophet_bias)),
by = c("month", "area")) %>%
rename(Actual = collisions, ARIMA = arima_pred, `3 Month MA` = m3_pred, `6 Month MA` = m6_pred, Prophet = prophet_pred) %>%
gather(cat, val, -c(month, area)) %>%
mutate(cat = factor(cat, levels = c("3 Month MA", "6 Month MA", "ARIMA", "Prophet", "Actual"))) %>%
ggplot(aes(x = month, y = val, group = cat, color = cat)) +
geom_line() + geom_point() +
scale_color_manual(values = c("#F8766D", "#7CAE00", "#00BFC4", "#C77CFF", "black")) +
expand_limits(y = 50) +
scale_x_date(date_breaks = "1 month", date_labels = "%b-%y") +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "", y = "", title = "Actual Collisions and Model Predictions for Area 2")
# Average validation MAPE per model, by month
all_preds %>%
  group_by(month) %>%
  summarize(`3 Month MA` = mean(m3_mape),
            `6 Month MA` = mean(m6_mape),
            ARIMA = mean(arima_mape),
            Prophet = mean(prophet_mape)) %>%
  ungroup() %>%
  gather(key = "model", value = "error", -month) %>%
  ggplot(aes(x = month, y = error, group = model, color = model)) +
  geom_line() + geom_point() +
  expand_limits(y = 0) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "", y = "MAPE", title = "Avg. Monthly MAPE for Validation Data", color = "Model")
# Average validation bias per model, by month.
# Moving average model biases tend to move together;
# the Prophet model seems to typically over-estimate.
all_preds %>%
  group_by(month) %>%
  summarize(`3 Month MA` = mean(m3_bias),
            `6 Month MA` = mean(m6_bias),
            ARIMA = mean(arima_bias),
            Prophet = mean(prophet_bias)) %>%
  ungroup() %>%
  gather(key = "model", value = "error", -month) %>%
  ggplot(aes(x = month, y = error, group = model, color = model)) +
  geom_line() + geom_point() +
  ylim(-0.1, 0.15) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "", y = "Bias", title = "Avg. Monthly Bias for Validation Data", color = "Model")
# Average validation MAPE per model, by area.
# Chaotic, but MAPE clearly varies by area, with interesting edge cases
# where only Prophet does badly (area 12) or only the MA models do (area 8).
all_preds %>%
  group_by(area) %>%
  summarize(`3 Month MA` = mean(m3_mape),
            `6 Month MA` = mean(m6_mape),
            ARIMA = mean(arima_mape),
            Prophet = mean(prophet_mape)) %>%
  ungroup() %>%
  gather(key = "model", value = "error", -area) %>%
  ggplot(aes(x = as.numeric(area), y = error, fill = model)) +
  geom_col(position = "dodge") +
  scale_x_continuous(breaks = 1:21) +
  labs(x = "Area", y = "MAPE", title = "Avg. Area MAPE for Validation Data", fill = "Model")
# Average validation bias per model, by area.
# Chaotic, but bias varies by area, with interesting edge cases:
# in area 1 only Prophet has positive bias; in area 17 only the 3-month
# MA has negative bias. Prophet looks worst overall on bias.
all_preds %>%
  group_by(area) %>%
  summarize(`3 Month MA` = mean(m3_bias),
            `6 Month MA` = mean(m6_bias),
            ARIMA = mean(arima_bias),
            Prophet = mean(prophet_bias)) %>%
  ungroup() %>%
  gather(key = "model", value = "error", -area) %>%
  ggplot(aes(x = as.numeric(area), y = error, fill = model)) +
  geom_col(position = "dodge") +
  scale_x_continuous(breaks = 1:21) +
  labs(x = "Area", y = "Bias", title = "Avg. Area Bias for Validation Data", fill = "Model")
Let’s look at area 6.
# Look at data and preds for area 6
# area is cast to character so the filter/join key matches all_preds$area
model_data %>%
  mutate(area = as.character(area)) %>%
  filter(area == 6 & month >= ymd("2018-05-01")) %>%
  # Keep only the prediction columns from all_preds; actuals come from model_data
  left_join(select(all_preds, -c(collisions, m3_mape, m3_bias, m6_mape, m6_bias, arima_mape, arima_bias, prophet_mape, prophet_bias)),
            by = c("month", "area")) %>%
  rename(Actual = collisions, ARIMA = arima_pred, `3 Month MA` = m3_pred, `6 Month MA` = m6_pred, Prophet = prophet_pred) %>%
  gather(cat, val, -c(month, area)) %>%
  # Fix series order so Actual is drawn last (black)
  mutate(cat = factor(cat, levels = c("3 Month MA", "6 Month MA", "ARIMA", "Prophet", "Actual"))) %>%
  ggplot(aes(x = month, y = val, group = cat, color = cat)) +
  geom_line() + geom_point() +
  scale_color_manual(values = c("#F8766D", "#7CAE00", "#00BFC4", "#C77CFF", "black")) +
  expand_limits(y = 50) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y") +
  theme(axis.text.x = element_text(angle = 90)) +
  # Added color = "Model": the legend title was missing here (it showed "cat"),
  # unlike the otherwise-identical area-14 plot later in the file
  labs(x = "", y = "", title = "Actual Collisions and Model Predictions for Area 6", color = "Model")
Now, I’ll look at the worst prediction (MAPE) for each model.
# Worst single prediction (highest MAPE) for each model; prediction and bias
# columns are dropped so only month/area/collisions and the four MAPEs print.
# Uses data masking (max(m3_mape)) instead of max(all_preds$m3_mape): the
# condition stays tied to the piped data rather than the global object.
all_preds %>%
  filter(m3_mape == max(m3_mape)) %>%
  select(-c(m3_pred, m6_pred, arima_pred, prophet_pred, m3_bias, m6_bias, arima_bias, prophet_bias))
## # A tibble: 1 x 7
## month area collisions m3_mape m6_mape prophet_mape arima_mape
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2018-09-01 14 233 0.260 0.162 0.0895 0.183
all_preds %>%
  filter(m6_mape == max(m6_mape)) %>%
  select(-c(m3_pred, m6_pred, arima_pred, prophet_pred, m3_bias, m6_bias, arima_bias, prophet_bias))
## # A tibble: 1 x 7
## month area collisions m3_mape m6_mape prophet_mape arima_mape
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-02-01 14 202 0.203 0.267 0.231 0.245
all_preds %>%
  filter(arima_mape == max(arima_mape)) %>%
  select(-c(m3_pred, m6_pred, arima_pred, prophet_pred, m3_bias, m6_bias, arima_bias, prophet_bias))
## # A tibble: 1 x 7
## month area collisions m3_mape m6_mape prophet_mape arima_mape
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-01-01 2 147 0.254 0.265 0.290 0.286
all_preds %>%
  filter(prophet_mape == max(prophet_mape)) %>%
  select(-c(m3_pred, m6_pred, arima_pred, prophet_pred, m3_bias, m6_bias, arima_bias, prophet_bias))
## # A tibble: 1 x 7
## month area collisions m3_mape m6_mape prophet_mape arima_mape
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-05-01 11 166 0.133 0.140 0.338 0.169
Let’s take a look at area 14.
# Look at data and preds for area 14
# Added mutate(area = as.character(area)): the equivalent area-6 plot casts
# area to character before joining on it, and left_join errors on mismatched
# key types. The cast is a no-op if model_data$area is already character,
# so this is safe either way.
model_data %>%
  mutate(area = as.character(area)) %>%
  filter(area == 14 & month >= ymd("2018-05-01")) %>%
  # Keep only the prediction columns from all_preds; actuals come from model_data
  left_join(select(all_preds, -c(collisions, m3_mape, m3_bias, m6_mape, m6_bias, arima_mape, arima_bias, prophet_mape, prophet_bias)),
            by = c("month", "area")) %>%
  rename(Actual = collisions, ARIMA = arima_pred, `3 Month MA` = m3_pred, `6 Month MA` = m6_pred, Prophet = prophet_pred) %>%
  gather(cat, val, -c(month, area)) %>%
  # Fix series order so Actual is drawn last (black)
  mutate(cat = factor(cat, levels = c("3 Month MA", "6 Month MA", "ARIMA", "Prophet", "Actual"))) %>%
  ggplot(aes(x = month, y = val, group = cat, color = cat)) +
  geom_line() + geom_point() +
  scale_color_manual(values = c("#F8766D", "#7CAE00", "#00BFC4", "#C77CFF", "black")) +
  expand_limits(y = 0) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "", y = "", title = "Actual Collisions and Model Predictions for Area 14", color = "Model")
I’ll do some initial work on daily modeling.
# Load daily predictions (produced offline) and normalize the join keys:
# date_occ to a "YYYY-MM-DD" string, area to character.
# Piped into mutate() with bare column names (data masking) instead of the
# original all_daily_preds$date_occ, which referenced the global object
# from inside mutate and required a separate assignment step.
all_daily_preds = read_csv("all_daily_preds.csv") %>%
  mutate(date_occ = substr(as.character(date_occ), 1, 10),
         area = as.character(area))
# Get data: collisions per day per area, through July 2019
# count() is shorthand for group_by() + summarize(n()) + ungroup()
daily_data = data_clean %>%
  filter(date_occ < ymd("2019-08-01")) %>%
  count(date_occ, area, name = "collisions")
# Look at data and preds for area 2
# date_occ is converted to character to match all_daily_preds' key type,
# then converted back to Date (ymd) for the x-axis after reshaping
area2_daily_df = daily_data %>%
  mutate(date_occ = as.character(date_occ)) %>%
  filter(area == 2 & date_occ >= ymd("2019-07-01")) %>%
  left_join(select(all_daily_preds, -c(collisions, m3_mape, m3_bias, m6_mape, m6_bias, arima_mape, arima_bias, prophet_mape, prophet_bias)),
            by = c("date_occ", "area")) %>%
  rename(Actual = collisions, ARIMA = arima_pred, `3 Month MA` = m3_pred, `6 Month MA` = m6_pred, Prophet = prophet_pred) %>%
  gather(cat, val, -c(date_occ, area)) %>%
  mutate(cat = factor(cat, levels = c("3 Month MA", "6 Month MA", "ARIMA", "Prophet", "Actual")),
         date_occ = ymd(date_occ))
ggplot(area2_daily_df, aes(x = date_occ, y = val, group = cat, color = cat)) +
  geom_line() +
  geom_point() +
  scale_color_manual(values = c("#F8766D", "#7CAE00", "#00BFC4", "#C77CFF", "black")) +
  expand_limits(y = 0) +
  labs(x = "", y = "", title = "Actual Collisions and Model Predictions for Area 2 (Daily Model)", color = "Model")
# Table of mean overall performance per model
# Pooled (all areas, all days) MAPE and bias per model, rounded to 4 decimals.
# data.frame() at the end so the wide one-row result prints in full rather
# than as a truncated tibble.
all_daily_preds %>% summarize(mean_m3_mape = round(mean(m3_mape), 4),
mean_m6_mape = round(mean(m6_mape), 4),
mean_arima_mape = round(mean(arima_mape), 4),
mean_prophet_mape = round(mean(prophet_mape), 4),
mean_m3_bias = round(mean(m3_bias), 4),
mean_m6_bias = round(mean(m6_bias), 4),
mean_arima_bias = round(mean(arima_bias), 4),
mean_prophet_bias = round(mean(prophet_bias), 4)) %>%
data.frame()
## mean_m3_mape mean_m6_mape mean_arima_mape mean_prophet_mape mean_m3_bias
## 1 0.4772 0.4529 0.4761 0.4318 0.2163
## mean_m6_bias mean_arima_bias mean_prophet_bias
## 1 0.2188 0.215 0.2232
# Average MAPE per model per day, reshaped long for plotting
daily_mape_df = all_daily_preds %>%
  group_by(date_occ) %>%
  summarize(`3 Month MA` = mean(m3_mape),
            `6 Month MA` = mean(m6_mape),
            `ARIMA` = mean(arima_mape),
            Prophet = mean(prophet_mape)) %>%
  ungroup() %>%
  gather(key = "model", value = "error", -date_occ)
ggplot(daily_mape_df, aes(x = date_occ, y = error, group = model, color = model)) +
  geom_line() +
  geom_point() +
  expand_limits(y = 0) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "", y = "MAPE", title = "Avg. Daily MAPE for Validation Data", color = "Model")
# Average bias per model per day
all_daily_preds %>%
  group_by(date_occ) %>% summarize(`3 Month MA` = mean(m3_bias),
                                   `6 Month MA` = mean(m6_bias),
                                   `ARIMA` = mean(arima_bias),
                                   Prophet = mean(prophet_bias)) %>% ungroup() %>%
  gather(key = "model", value = "error", -date_occ) %>%
  ggplot(aes(x = date_occ, y = error, group = model, color = model)) +
  geom_line() + geom_point() +
  theme(axis.text.x = element_text(angle = 90)) +
  # Fixed copy-paste title: this is the daily model grouped by date_occ,
  # so "Daily" (was "Avg. Monthly Bias"), matching the daily MAPE plot above
  labs(x = "", y = "Bias", title = "Avg. Daily Bias for Validation Data", color = "Model")
# Average MAPE per model per area (daily model)
daily_area_mape_df = all_daily_preds %>%
  group_by(area) %>%
  summarize(`3 Month MA` = mean(m3_mape),
            `6 Month MA` = mean(m6_mape),
            `ARIMA` = mean(arima_mape),
            Prophet = mean(prophet_mape)) %>%
  ungroup() %>%
  gather(key = "model", value = "error", -area)
# area is stored as character, so convert for a continuous axis with 1:21 breaks
ggplot(daily_area_mape_df, aes(x = as.numeric(area), y = error, fill = model)) +
  geom_col(position = "dodge") +  # geom_col == geom_bar(stat = "identity")
  scale_x_continuous(breaks = 1:21) +
  labs(x = "Area", y = "MAPE", title = "Avg. Area MAPE for Validation Data", fill = "Model")
# Average bias per model per area (daily model)
daily_area_bias_df = all_daily_preds %>%
  group_by(area) %>%
  summarize(`3 Month MA` = mean(m3_bias),
            `6 Month MA` = mean(m6_bias),
            `ARIMA` = mean(arima_bias),
            Prophet = mean(prophet_bias)) %>%
  ungroup() %>%
  gather(key = "model", value = "error", -area)
# area is stored as character, so convert for a continuous axis with 1:21 breaks
ggplot(daily_area_bias_df, aes(x = as.numeric(area), y = error, fill = model)) +
  geom_col(position = "dodge") +  # geom_col == geom_bar(stat = "identity")
  scale_x_continuous(breaks = 1:21) +
  labs(x = "Area", y = "Bias", title = "Avg. Area Bias for Validation Data", fill = "Model")
This section lists next steps I would take to build on this analysis.